#This is to get
#Figure5b
rm(list = ls())
###########
#Libraries#
###########
library("ggplot2")
library("glue")
library('gridExtra')
library('grid')
library('lfe')
library("rddtools")

setwd("/Users/bgpopescu/Dropbox/Legacies_Central_Europe/")
setwd("C:/Users/bogdanp/Dropbox/Legacies_Central_Europe/")

##########
#Read CSV#
##########
data_1910 <- read.csv(file = './data/data_ro_1910.csv')


###################
#Defining Distance#
###################

data_1910$dist1 <- as.numeric( with (data_1910,ifelse(data_1910$treat==1, 1, -1)))
data_1910$dist2 <-data_1910$dist1*data_1910$Ott_Habs_brd_distance

data_1910$bfe1<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==1, 1, 0)
data_1910$bfe2<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==9, 1, 0)
data_1910$bfe3<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==10, 1, 0)
data_1910$bfe4<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==11, 1, 0)
data_1910$bfe5<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==12, 1, 0)
data_1910$bfe6<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==13, 1, 0)
data_1910$bfe7<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==14, 1, 0)
data_1910$bfe8<-ifelse(data_1910$Ott_Habs_brd_NEAR_FID==15, 1, 0)

mean(data_1910$pct_literate[data_1910$treat==0], na.rm=T)
mean(data_1910$pct_literate[data_1910$treat==1], na.rm=T)


# Set outcome, forcing and cutoff variable
rdd2_obj1 = rdd_data(y = pct_literate, x = dist2, 
                     cutpoint = 0,
                     data = data_1910)

# Restrict sample to bandwidth area
bw_ik <- rdd_bw_ik(rdd2_obj1)
optimal_band<-round(bw_ik)


####################
# Helper functions #
####################
comp_dist <- function(data_sample, y, x, spec, title_spec) {
  # Takes:
  #   - data_sample, a dataframe with the relevant data
  #   - y, a string with my dependent variable
  #   - x, a list of strings with my independent variables
  #   - spec, model specification
  #   - title_spec, label for the model specification (to appear on
  #                 the title of the graph)
  # Returns:
  #   Nothing, creates one picture in the folder. This function
  #   compares bandwidths in increments of 1 Km for the outcomes
  #   of interest (y) and the specified model. 
  
  # Define range of potential bandwidths
  sequence = seq(from = 35, to = 120, by = 1)
  
  #Create empty variables
  confint1 = rep(NA, length(sequence))
  confint2 = rep(NA, length(sequence))
  confint3 = rep(NA, length(sequence))
  confint4 = rep(NA, length(sequence))
  coef1 = rep(NA, length(sequence))
  
  
  # This loop calculates local linear estimates
  # for bandwidths from 15 to 150 kilometers
  specification = as.formula(paste(glue("{y}~"), 
                                   paste(x, collapse = "+")))

  for (i in 1 : length(sequence)){
    band = data_sample[which(data_sample$dist2 > -sequence[i] & 
                               data_sample$dist2 < sequence[i]),]
    model <- felm(specification, paste(glue("|0|0|0")), data = band)
    confint1[i] = confint(model)[2,1]
    confint2[i] = confint(model)[2,2]
    confint3[i] = confint(model, level = 0.90)[2,1]
    confint4[i] = confint(model, level = 0.90)[2,2]  
    coef1[i] = coef(model)[2]
  }

  model_optimal <- felm(specification, paste(glue("|0|0|0")), data = subset(data_sample, Ott_Habs_brd_distance<optimal_band))
  estimates <- coef(model_optimal)
  estimate <-estimates[grep("treat", names(estimates))]
  se <- sqrt(diag(vcov(model_optimal)))
  se_est <- model_optimal$se
  se_est_treat <- se_est[grep("treat", names(estimates))]
  
  
  df <- data.frame(sequence = sequence, coef1 = coef1, 
                   confint1 = confint1, confint2 = confint2,
                   confint3 = confint3, confint4 = confint4)
  
  #Removing coefficients outside of the y-axis limits
  condition<-(df$confint1<(-40) | df$confint2>5 |
                df$confint3<(-40) | df$confint4>5 )  
  
  df$coef1<-ifelse(condition, NA, df$coef1)
  
  mean_feature <- ggplot(data = df, aes(sequence, coef1)) +
    geom_point(size = 1.3) +
    geom_point(data = df[df$sequence==round(bw_ik),], color= "red", size = 4)+ 
    geom_errorbar(aes(ymin = confint1, ymax = confint2), size = 0.2, width = 0)+
    geom_errorbar(aes(ymin = confint3, ymax = confint4), size = 0.5, width = 0)+
    geom_errorbar(data = df[df$sequence==round(bw_ik),],
                  aes(ymin=confint1, ymax=confint2), size=0.4, width = 0, color= "red")+
    geom_errorbar(data = df[df$sequence==round(bw_ik),],
                  aes(ymin=confint3, ymax=confint4), size=1, width = 0, color= "red")+
    geom_hline(yintercept = 0) +
    theme_bw() +
    scale_x_continuous(name = "Bandwidth (km)", breaks=seq(35,120,20)) +
    scale_y_continuous(name = "Estimate", limits = c(-40, 5)) +
    ggtitle(glue("{title_spec}")) +
    theme(plot.title = element_text(hjust = 0.5),
          axis.text.x = element_text(size=14),
          axis.text.y = element_text(size=14),
          axis.title=element_text(size=16))+
    annotate(geom = "text", x=-Inf, y=Inf, label = c(paste(" Habsburg N:", nrow(data_sample[data_sample$treat==0,]),"\n", 
                                                           "Ottoman N:", nrow(data_sample[data_sample$treat==1,]))), hjust = 0, vjust = 1)+
    annotate(geom = "text", x=Inf, y=Inf, label = c(paste("Mean:", round(mean(data_sample[[y]], na.rm=TRUE),2),"\n", 
                                                          "SD:", round(sd(data_sample[[y]], na.rm=TRUE),2))), hjust = 1, vjust = 1)+
    annotate(geom = "text", x=Inf, y=-Inf, label = c(paste("Opt. Band:", optimal_band, " km", "\n", 
                                                           "Estimate:", round(estimate,2),"\n",
                                                           "Standard Error:", round(se_est_treat,2))), hjust = 1, vjust = 0)
  
  #return(df)
  return(mean_feature)
}

x<-comp_dist(data_1910, "pct_literate", "treat", "treat", "treat")

create_graphs_per_specifications <- function(regressors, specifications, 
                                             specification_label, data_sample, data_sample_name){
  # Takes:
  #   - regressors, a list of dependent variables
  #   - specifications, lists of model specifications (indep variables)
  #   - specification_label, a list of labels for model specifications
  #   - data_sample, the dataset of interest
  #   - data_sample_name, label for the data_sample
  
  # Returns:
  #   Creates one grid of pictures in the folder. This function
  #   allows us to vary the regressors.
  
  for (y in regressors) {
    regressor_name  = y[[1]]
    regressor_label = y[[2]]
    figure_name = y[[3]]
    
    print(glue("Using dependent variable {regressor_label}.."))
    graphs <- list()
    for (spec in names(specifications)){
      print(spec)
      print(specification_label[[spec]])
      temp_graph <- comp_dist(data_sample, regressor_name, 
                              specifications[[spec]], spec, 
                              specification_label[[spec]])
      graphs[[paste(c(spec, regressor_name), collapse = '_')]] <- temp_graph
    }
    
    #Ncol may need to be changed
    grid <- grid.arrange(grobs = graphs, nrow = 4, ncol = 1,
                         top = textGrob({},
                                        gp = gpar(fontsize = 20, font = 2)))
    ggsave(grid, file = glue("{figure_name}{data_sample_name}.jpg"), 
           height = 35, width = 12, 
           units = "cm", dpi = 300)
  }
  
}

optimal_band
data_1910$Ott_Habs_brd_distance

########
#MODELS#
########

# Define model specifications
specifications <- list(c('treat'),
                       c('treat', 'POINT_X', 'POINT_Y', 'bfe1', 'bfe2', 'bfe3', 'bfe4', 'bfe5', 'bfe6', 'bfe7'),
                       c('treat', 'Ott_Habs_brd_distance', 'bfe1', 'bfe2', 'bfe3', 'bfe4', 'bfe5', 'bfe6', 'bfe7'),
                       c('treat', 'POINT_X', 'POINT_Y', 'Ott_Habs_brd_distance', 'bfe1', 'bfe2', 'bfe3', 'bfe4', 'bfe5', 'bfe6', 'bfe7'))


names(specifications) <- c('mean', 'meanbfeXY', "meanbfedist", "meanbfeXYdist")

specification_label<-c("Treat", 
                       "Treat + Lat-Lon + BFE", 
                       "Treat + Dist. Brd + BFE",
                       "Treat + Lat-Lon + Dist. Brd + BFE")

names(specification_label)<-c('mean', 'meanbfeXY', "meanbfedist", "meanbfeXYdist")
regressors<-list(c("pct_literate", "Pct. Literate", ""))


shortcut = glue("./Paper/graphs/")
setwd(shortcut)

create_graphs_per_specifications(regressors, specifications, 
                                 specification_label, data_1910, 'figure5b')
